suppressPackageStartupMessages(library(gapminder))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(DT))
suppressPackageStartupMessages(library(forcats))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggrepel))
Using the “here” package creates relative paths that make a project easier to share, whereas a hard-coded base R directory path only works from your own working directory on your local drive. The here package does what other packages can do, but in a simpler, platform-independent way. It also removes the need to set the working directory, which further eases collaboration. When a project lives on GitHub, here builds paths relative to the project root, so they keep working for anyone who clones the repository, instead of pointing to an absolute path on your hard drive.
First, I’ll explore the factors within the gapminder dataset to see what we’re working with.
# List the levels of the continent factor.
levels(gapminder$continent)
## [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
# Show the compact structure of the same factor.
gapminder$continent %>%
  str()
## Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
# Render the full dataset as an interactive table.
DT::datatable(gapminder)
Now we’ll drop Oceania from the continent factor.
# Keep only the rows whose continent is one of the four listed below
# (everything except Oceania).
the_north <- c("Africa", "Americas", "Asia", "Europe")
we_the_north <- filter(gapminder, continent %in% the_north)
And now I’ll re-explore our data.
# Filtering rows does not remove the now-unused "Oceania" level.
levels(we_the_north$continent)
## [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
we_the_north$continent %>%
  str()
## Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
Since the level Oceania is still listed, we need to explicitly drop the unused factor level that we filtered out.
# Remove factor levels that no longer occur in the filtered data.
we_the_NORTH <- droplevels(we_the_north)
And now I’ll (re-)re-explore our data.
# After droplevels() only the four remaining continents are levels.
levels(we_the_NORTH$continent)
## [1] "Africa" "Americas" "Asia" "Europe"
we_the_NORTH$continent %>%
  str()
## Factor w/ 4 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
# Number of rows per remaining continent.
count(we_the_NORTH, continent)
## # A tibble: 4 x 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
Re-order the continent levels by frequency so that the number of observations increases from the top to the bottom of the plot.
# Horizontal bar chart counting the rows with lifeExp above 50 per continent;
# fct_infreq() orders the continent levels by how often they occur.
ggplot(filter(we_the_NORTH, lifeExp > 50)) +
  geom_bar(aes(fct_infreq(continent))) +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Occurence of lifeExp higher than 50",
    x = "Continent",
    y = "# of Observations"
  )
# Keep the rows with lifeExp above 50, write them to a CSV in the project
# root, and read that file straight back in.
new_data <- filter(we_the_NORTH, lifeExp > 50)
readr::write_csv(new_data, here::here("new_data.csv"))
new_data2 <- readr::read_csv(here::here("new_data.csv"))
## Parsed with column specification:
## cols(
## country = col_character(),
## continent = col_character(),
## year = col_double(),
## lifeExp = col_double(),
## pop = col_double(),
## gdpPercap = col_double()
## )
# Show the re-imported data as an interactive table.
new_data2 %>%
  DT::datatable()
Explore new data, what happened?
# levels() must be applied to a factor column: called on the whole data frame
# it always returns NULL and reveals nothing about the columns.
levels(new_data$continent)
## [1] "Africa" "Americas" "Asia" "Europe"
# Column types of the in-memory data, before the CSV round trip.
str(new_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1189 obs. of 6 variables:
## $ country : Factor w/ 140 levels "Afghanistan",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ continent: Factor w/ 4 levels "Africa","Americas",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 55.2 59.3 64.8 66.2 67.7 ...
## $ pop : int 1282697 1476505 1728137 1984060 2263554 2509048 2780097 3075321 3326498 3428038 ...
## $ gdpPercap: num 1601 1942 2313 2760 3313 ...
# Inspect the continent column itself rather than the whole data frame:
# read_csv() parsed it as character, so it has no levels.
levels(new_data2$continent)
## NULL
# Column types after the CSV round trip.
str(new_data2)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1189 obs. of 6 variables:
## $ country : chr "Albania" "Albania" "Albania" "Albania" ...
## $ continent: chr "Europe" "Europe" "Europe" "Europe" ...
## $ year : num 1952 1957 1962 1967 1972 ...
## $ lifeExp : num 55.2 59.3 64.8 66.2 67.7 ...
## $ pop : num 1282697 1476505 1728137 1984060 2263554 ...
## $ gdpPercap: num 1601 1942 2313 2760 3313 ...
## - attr(*, "spec")=
## .. cols(
## .. country = col_character(),
## .. continent = col_character(),
## .. year = col_double(),
## .. lifeExp = col_double(),
## .. pop = col_double(),
## .. gdpPercap = col_double()
## .. )
Writing the data to CSV and reading it back has dropped the factor type of both country and continent; they are now stored as character vectors. I must re-assign country and continent as factors!
# Convert the two character columns back into factors.
new_data2[["country"]] <- factor(new_data2[["country"]])
new_data2[["continent"]] <- factor(new_data2[["continent"]])
Plot the re-assigned factors.
# Horizontal bar chart of row counts per continent, with the level order
# reversed by fct_rev().
new_data2 %>%
  ggplot() +
  geom_bar(aes(fct_rev(continent), colour = continent)) +
  coord_flip() +
  theme_bw() +
  # labs() names aesthetics, not screen positions: even with coord_flip(),
  # x is the continent and y is the count.
  labs(x = "Continent", y = "# of Observations")
Order by population size:
# Restrict the data to three countries.
new_countries <- c("Canada", "Albania", "Taiwan")
new_factor <- new_data2 %>%
  filter(country %in% new_countries)
# Box plot of life expectancy per country. fct_reorder() with its defaults
# sorts the country levels by ascending median pop, so the title describes
# that ordering.
new_factor %>%
  ggplot() +
  geom_boxplot(aes(x = fct_reorder(country, pop), y = lifeExp, fill = country)) +
  theme_bw() +
  labs(
    x = "Country",
    y = "Life Expectancy",
    title = "Countries ordered by increasing median population"
  )
# Rwanda's rows with two derived columns:
#   lifeExp_inc - change in lifeExp relative to the previous row
#   GDP         - gdpPercap multiplied by pop
# The outer parentheses print the result as well as assigning it.
(Rwanda <- gapminder %>%
  filter(country == "Rwanda") %>%
  # Sort first: lag() works on row order, so the difference is only a
  # year-over-year change when the rows are in chronological order.
  arrange(year) %>%
  mutate(
    lifeExp_inc = lifeExp - dplyr::lag(lifeExp),
    GDP = gdpPercap * pop
  ))
## # A tibble: 12 x 8
## country continent year lifeExp pop gdpPercap lifeExp_inc GDP
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Rwanda Africa 1952 40 2534927 493. NA 1.25e9
## 2 Rwanda Africa 1957 41.5 2822082 540. 1.5 1.52e9
## 3 Rwanda Africa 1962 43 3051242 597. 1.5 1.82e9
## 4 Rwanda Africa 1967 44.1 3451079 511. 1.1 1.76e9
## 5 Rwanda Africa 1972 44.6 3992121 591. 0.5 2.36e9
## 6 Rwanda Africa 1977 45 4657072 670. 0.400 3.12e9
## 7 Rwanda Africa 1982 46.2 5507565 882. 1.22 4.86e9
## 8 Rwanda Africa 1987 44.0 6349365 848. -2.20 5.38e9
## 9 Rwanda Africa 1992 23.6 7290203 737. -20.4 5.37e9
## 10 Rwanda Africa 1997 36.1 7212583 590. 12.5 4.26e9
## 11 Rwanda Africa 2002 43.4 7852401 786. 7.33 6.17e9
## 12 Rwanda Africa 2007 46.2 8860588 863. 2.83 7.65e9
# Scatter plot of the change in life expectancy against GDP (log10 x axis),
# with each point labelled by its year.
ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc)) +
  geom_point() +
  geom_text(aes(label = year), hjust = 0, vjust = 1) +
  scale_x_log10(labels = scales::comma_format())
# First plot: linear GDP axis with plain geom_text() year labels.
plot1 <- ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc)) +
  geom_point() +
  geom_text(aes(label = year), hjust = 0, vjust = 0)
# Better plot: log10 GDP axis with comma labels, repelled year labels,
# explicit axis titles and the black-and-white theme.
plot2 <- ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc, label = year)) +
  geom_point() +
  geom_text_repel() +
  scale_x_log10(labels = scales::comma_format()) +
  theme_bw() +
  labs(
    title = "Rwandan census and productivity data",
    x = "Gross GDP",
    y = " Change in life expectancy"
  )
# Put both plots side by side under a shared heading.
gridExtra::grid.arrange(
  plot1, plot2,
  top = "Before and After Data Visualization",
  nrow = 1
)
# Scatter plot coloured by year, then converted to an interactive plotly widget.
plot3 <- Rwanda %>%
  ggplot(aes(x = GDP, y = lifeExp_inc, colour = year)) +
  geom_point() +
  labs(title = "Rwandan GDP and change in life expectancy (1957-2007)")
ggplotly(plot3)
##Exercise 5
# Save plot2 as a JPEG in the project root. The filename carries the .jpg
# extension so the written file is recognisable as an image.
ggsave(
  filename = "Betterplot.jpg",
  plot = plot2,
  device = "jpg",
  path = here::here()
)
## Saving 7 x 5 in image